Model 1: Continuous Unconditioned Generation¶
Imports and Constants¶
import os
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio
import soundfile as sf
from IPython.display import Audio
def dummy_npwarn_decorator_factory():
    """Return a pass-through decorator.

    Serves as a stand-in for ``np._no_nep50_warning`` on NumPy builds that do
    not provide that private helper.
    """
    def _passthrough(obj):
        return obj

    return _passthrough


# Install the stand-in only when NumPy lacks the attribute; an existing
# implementation is left untouched.
if not hasattr(np, '_no_nep50_warning'):
    np._no_nep50_warning = dummy_npwarn_decorator_factory
# Release cached GPU memory that an earlier run may have left behind.
torch.cuda.empty_cache()

# Fixed seed so that random.sample() picks the same preview files on each run.
random.seed(24)

# Settings shared by the audio cells below.
EXPORT = './export'  # folder that receives exported plots and audio
SAMPLE_RATE = 44100  # sample rate (Hz) passed to every load / plot / export call
Exploratory Data Analysis and Preprocessing¶
Our data comes from various sources:
- www.youtube.com/watch?v=lTRiuFIWV54
- archive.org/details/deep-spot-city-of-gamers-chill-gaming-studying-lofi-hip-hop-mix-1-hour
- https://archive.org/details/lo-fi-for-reading-manga-pb8ljm/Mabisyo+-+Lo%E2%80%8B-%E2%80%8BFi+For+Reading+Manga+-+04+Monkey+D.+Luffy.flac
- https://archive.org/details/motel-smoke-at-2-am
- https://archive.org/details/kalaido-hanging-lanterns_202101/flovry+-+car+radio.wav
With the help of some ChatGPT-generated code, I converted all of the clips to .wav and split them into 11-second chunks. Uncomment and run the cell below to preprocess new clips.
# # code adjusted from chatGPT with prompts "write me a snippet of code to convert flac, wav, mp3, and m4a into 11 second long wav files with a sample rate of 44100 Hz", "write me a snippet of code to convert flac, wav, mp3, and m4a into 11 second long wav files with a sample rate of 44100 Hz", "discard the final chunk instead of padding=", "can you use torchaudio instead of pydub"
# # Settings
# input_dir = "./raw_audio/unfinished"
# output_dir = "./train_data"
# target_sr = 44100
# chunk_duration_s = 11 # 11 seconds
# chunk_duration_samples = target_sr * chunk_duration_s # 11 seconds in samples
# os.makedirs(output_dir, exist_ok=True)
# valid_exts = (".flac", ".wav", ".mp3", ".m4a")
# def load_audio(file_path, sr):
# """
# Load audio using torchaudio for supported formats or librosa for others.
# """
# ext = file_path.lower().split('.')[-1]
# if ext in ['flac', 'wav']:
# waveform, original_sr = torchaudio.load(file_path)
# else:
# waveform, original_sr = librosa.load(file_path, sr=None) # Load with original sampling rate
# waveform = torch.tensor(waveform) # Convert to torch tensor
# if original_sr != sr:
# # Resample if needed
# resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=sr)
# waveform = resampler(waveform)
# return waveform
# def split_audio(waveform, chunk_length_samples):
# """
# Split the waveform into 11-second chunks.
# """
# num_chunks = waveform.size(1) // chunk_length_samples
# return [waveform[:, i * chunk_length_samples:(i + 1) * chunk_length_samples] for i in range(num_chunks)]
# for filename in os.listdir(input_dir):
# if filename.lower().endswith(valid_exts):
# input_path = os.path.join(input_dir, filename)
# base_name = os.path.splitext(filename)[0]
# try:
# # Load audio
# waveform = load_audio(input_path, target_sr)
# # Split into full-length chunks
# chunks = split_audio(waveform, chunk_duration_samples)
# # Save each chunk as a .wav file
# for i, chunk in enumerate(chunks):
# chunk_filename = f"{base_name}_chunk{i+1:03d}.wav"
# chunk_path = os.path.join(output_dir, chunk_filename)
# # Save chunk
# torchaudio.save(chunk_path, chunk, target_sr)
# print(f"✅ Saved: {chunk_filename}")
# except Exception as e:
# print(f"❌ Error processing {filename}: {e}")
Spectrogram conversion¶
## Code adjusted from chatGPT prompted with "using librosa loop through a batch of wav files and give me their spectrograms"
## Directory containing WAV files
#wav_dir = "./train_data"
#out_dir = "./train_data_spectrograms"
#wav_files = [f for f in os.listdir(wav_dir) if f.endswith(".wav")]
## Number of files to process
#max_files = 1000
#for i, file in enumerate(wav_files[:max_files]):
# file_path = os.path.join(wav_dir, file)
# # Load stereo audio (converts to mono by default)
# y, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
# # Compute the mel power spectrogram (dB/log scaling is only applied later, when plotting)
# S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
# np.save(os.path.join(out_dir, f'{i}_mel.npy'), S)
Now let's take a look at and listen to the audio generated from these spectrograms
# Preview a few stored mel spectrograms: plot each one, reconstruct audio from
# it, and export both the plot and the audio.
tensor_dir = "./train_data_spectrograms"
tensor_files = [f for f in os.listdir(tensor_dir) if f.endswith(".npy")]
max_samples = 4

# Make sure the export folder exists before anything is written into it.
os.makedirs(EXPORT, exist_ok=True)

# Never ask for more files than exist: random.sample() raises ValueError
# when the requested sample is larger than the population.
for file in random.sample(tensor_files, min(max_samples, len(tensor_files))):
    file_path = os.path.join(tensor_dir, file)

    # Load the stored mel spectrogram
    mel = np.load(file_path)
    print(mel.shape)

    # Convert power to dB relative to the loudest bin for display
    log_S = librosa.power_to_db(mel, ref=np.max)

    # Plot
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
    plt.title(f"Log-Mel Spectrogram: {file}")
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

    # Spectrogram to audio (same n_fft / hop_length as used for the plot axis)
    y = librosa.feature.inverse.mel_to_audio(mel, sr=SAMPLE_RATE, n_fft=2048, hop_length=512)
    display(Audio(y, rate=SAMPLE_RATE))
    sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
(128, 948)
(128, 948)
(128, 948)
(128, 948)
Model¶
I am following the guide on training a diffusion model from Diffusers https://huggingface.co/docs/diffusers/v0.33.1/en/tutorials/basic_training
I tested various hyperparameters and found the hyperparameters below to be the best for decently fast training.
from dataclasses import dataclass
@dataclass
class TrainingConfig:
    """Hyperparameters and output settings for the mel-spectrogram diffusion model.

    Every setting is an annotated dataclass field, so individual values can be
    overridden per instance, e.g. ``TrainingConfig(num_epochs=10)``. Without
    annotations ``@dataclass`` would treat them as plain class attributes and
    reject such keyword arguments.
    """

    image_size: tuple = (128, 128)  # (128, 256)  # the generated image resolution
    train_batch_size: int = 8
    eval_batch_size: int = 8  # how many images to sample during evaluation
    num_epochs: int = 100
    gradient_accumulation_steps: int = 1
    learning_rate: float = 1e-4
    lr_warmup_steps: int = 500
    save_image_epochs: int = 20  # sample demo spectrograms every N epochs
    save_model_epochs: int = 20  # save a checkpoint every N epochs
    mixed_precision: str = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir: str = "mel-generation"  # the model name locally and on the HF Hub
    overwrite_output_dir: bool = True  # overwrite the old model when re-running the notebook
    seed: int = 0
    max_samples: int = -1  # -1 means "use every spectrogram file"


config = TrainingConfig()
Dataset Loader¶
from torch.utils.data import DataLoader, Dataset
class SpectrogramDataset(Dataset):
    """Dataset of spectrograms stored as ``.npy`` files in a single directory.

    Each item is a float32 tensor with a leading channel dimension, cropped
    from the top-left corner to at most ``sample_size``.

    Args:
        spectrogram_dir: directory that contains the ``.npy`` files.
        sample_size: ``(height, width)`` crop applied to every spectrogram.
        max_samples: maximum number of ``.npy`` files to use; ``-1`` uses all.
    """

    def __init__(self, spectrogram_dir, sample_size=(128, 896), max_samples=-1):
        self.sample_size = sample_size
        # Sort the listing so the file order (and therefore which files a
        # max_samples limit keeps) is the same on every run and platform.
        paths = [
            os.path.join(spectrogram_dir, name)
            for name in sorted(os.listdir(spectrogram_dir))
            if name.endswith(".npy")
        ]
        if max_samples != -1:
            # Keep exactly max_samples spectrogram files; entries with other
            # extensions no longer count towards the limit.
            paths = paths[:max(max_samples, 0)]
        self.spectrogram_files = paths

    def __len__(self):
        return len(self.spectrogram_files)

    def __getitem__(self, idx):
        # A slice returns a plain list of individually loaded items.
        if isinstance(idx, slice):
            return [self[i] for i in range(*idx.indices(len(self)))]

        spectrogram = np.load(self.spectrogram_files[idx])

        # Top-left crop to sample_size (no centre crop); arrays that are
        # already smaller are returned unchanged.
        # assumes each file holds a 2-D array -- TODO confirm
        target_h, target_w = self.sample_size
        spectrogram = spectrogram[:target_h, :target_w]

        # Add the channel dimension; the batch dimension is added by the DataLoader.
        return torch.tensor(spectrogram, dtype=torch.float32).unsqueeze(0)
# Build the training set from the stored spectrograms and wrap it in a
# shuffled DataLoader.
dataset = SpectrogramDataset(
    './train_data_spectrograms',
    sample_size=config.image_size,
    max_samples=config.max_samples,
)
train_dataloader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=config.train_batch_size,
)

# Show the first two items as a quick sanity check.
for i, data in enumerate(dataset[:2]):
    display(data)
tensor([[[4.9801e+01, 9.5735e+01, 8.4604e+01, ..., 1.9413e+00,
1.8168e+00, 1.7897e+00],
[9.9013e+01, 3.3336e+02, 3.9260e+02, ..., 8.8301e+00,
8.7300e+00, 8.7972e+00],
[5.0395e+01, 8.8918e+01, 7.5389e+01, ..., 2.2583e+00,
2.5656e+00, 2.1667e+00],
...,
[1.4460e-04, 3.6040e-05, 1.7846e-09, ..., 1.0010e-09,
1.1840e-09, 1.4590e-09],
[1.4301e-04, 3.5646e-05, 1.0841e-09, ..., 1.1986e-09,
1.0729e-09, 1.0354e-09],
[1.4195e-04, 3.5379e-05, 1.2232e-09, ..., 1.3664e-09,
1.3587e-09, 1.7225e-09]]])
tensor([[[5.5976e-03, 8.9468e-04, 2.3048e-05, ..., 2.1976e-05,
5.0490e-05, 7.8640e-05],
[7.7224e-03, 1.4194e-03, 1.3512e-04, ..., 4.6296e-04,
1.8532e-04, 2.7411e-04],
[1.2057e-02, 7.2128e-03, 1.4132e-02, ..., 3.1841e-03,
1.7689e-03, 8.8070e-04],
...,
[3.7608e-05, 9.3756e-06, 1.4952e-09, ..., 1.6208e-09,
1.8723e-09, 1.3833e-09],
[3.7141e-05, 9.2574e-06, 1.6648e-09, ..., 1.6661e-09,
2.0163e-09, 1.9332e-09],
[3.6892e-05, 9.1962e-06, 1.6135e-09, ..., 1.3921e-09,
1.4930e-09, 1.5270e-09]]])
# Re-create the shuffled training DataLoader (same arguments as the earlier DataLoader cell).
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)
Diffusion Model¶
from diffusers import UNet2DModel
# UNet that takes a single-channel spectrogram plus a timestep; the cells below
# train it to predict the noise that was added to the input.
model = UNet2DModel(
sample_size=config.image_size, # the target image resolution
in_channels=1, # the number of input channels (1 here: single-channel spectrogram; 3 would be RGB)
out_channels=1, # the number of output channels
layers_per_block=2, # how many ResNet layers to use per UNet block
# NOTE(review): 948 breaks the otherwise non-decreasing 128/256/512 pattern and equals the
# frame count printed for the stored spectrograms -- confirm it is intentional. Changing it
# alters the architecture, so previously saved checkpoints would no longer load.
block_out_channels=(128, 948, 256, 256, 512, 512), # the number of output channels for each UNet block
down_block_types=(
"DownBlock2D", # a regular ResNet downsampling block
"DownBlock2D",
"DownBlock2D",
"DownBlock2D",
"AttnDownBlock2D", # a ResNet downsampling block with spatial self-attention
"DownBlock2D",
),
up_block_types=(
"UpBlock2D", # a regular ResNet upsampling block
"AttnUpBlock2D", # a ResNet upsampling block with spatial self-attention
"UpBlock2D",
"UpBlock2D",
"UpBlock2D",
"UpBlock2D",
),
# number of groups used by the normalization layers
norm_num_groups = 2,
)
# # Load from checkpoint TODO
# from diffusers import DDPMPipeline
# if config.checkpoint:
# checkpoint_dir = os.path.join(config.output_dir, "checkpoints", config.checkpoint)
# loaded_pipeline = DDPMPipeline.from_pretrained(checkpoint_dir)
# model = loaded_pipeline.unet
# noise_scheduler = loaded_pipeline.scheduler
C:\Users\arthurchan\miniconda3\envs\cse153\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# Smoke test: push one spectrogram through the untrained UNet and compare shapes.
sample_image = dataset[0].unsqueeze(0)  # prepend the batch dimension
print("Input shape:", sample_image.shape)
prediction = model(sample_image, timestep=0)
print("Output shape:", prediction.sample.shape)
Input shape: torch.Size([1, 1, 128, 128]) Output shape: torch.Size([1, 1, 128, 128])
from PIL import Image
from diffusers import DDPMScheduler
# Noise the sample image at timestep 50 to check the scheduler's forward process.
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.LongTensor([50])
noise = torch.randn(sample_image.shape)
noisy_image = noise_scheduler.add_noise(sample_image, noise, timesteps)
noisy_image.shape
torch.Size([1, 1, 128, 128])
# Visualize the noised sample on a dB scale (relative to its loudest bin)
noisy_mel = noisy_image[0, 0, :, :]
log_S = librosa.power_to_db(noisy_mel, ref=np.max)

# Plot
plt.figure(figsize=(10, 4))
librosa.display.specshow(
    log_S,
    sr=SAMPLE_RATE,
    hop_length=512,
    x_axis='time',
    y_axis='mel',
)
plt.title("Noisy Log-Mel Spectrogram")
plt.colorbar(format="%+2.0f dB")
plt.tight_layout()
plt.show()
import torch.nn.functional as F
# One forward pass of the untrained model: the loss is the mean squared error
# between the predicted noise and the noise that was actually added.
noise_pred = model(noisy_image, timesteps).sample
loss = F.mse_loss(noise_pred, noise)
# Bare expression so the notebook displays the loss value.
loss
tensor(1.0781, grad_fn=<MseLossBackward0>)
Training¶
from diffusers.optimization import get_cosine_schedule_with_warmup
# AdamW optimizer plus a cosine learning-rate schedule with warmup; the
# schedule spans one step per batch over all epochs.
total_training_steps = len(train_dataloader) * config.num_epochs
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=total_training_steps,
)
from diffusers import DDPMPipeline
from diffusers.utils import make_image_grid
def evaluate(config, epoch, pipeline):
    """Sample a batch from ``pipeline`` and save every sample as a ``.npy`` file.

    Files are written to ``<config.output_dir>/samples/<epoch>_<index>.npy``.

    Args:
        config: settings object; reads ``eval_batch_size``, ``seed`` and ``output_dir``.
        epoch: epoch number used as the file-name prefix.
        pipeline: callable whose result exposes an ``images`` attribute.
    """
    # A dedicated, seeded CPU generator keeps sampling reproducible without
    # rewinding the random state of the main training loop.
    sampling_rng = torch.Generator(device='cpu').manual_seed(config.seed)

    # NOTE(review): output_type="" is presumably meant to skip the PIL
    # conversion so that raw arrays come back -- confirm against the pipeline.
    result = pipeline(
        batch_size=config.eval_batch_size,
        generator=sampling_rng,
        output_type="",
    )
    samples = result.images

    # Save the samples
    test_dir = os.path.join(config.output_dir, "samples")
    os.makedirs(test_dir, exist_ok=True)
    for index, sample_array in enumerate(samples):
        target = os.path.join(test_dir, f"{epoch}_{index}.npy")
        np.save(target, np.squeeze(sample_array))
from accelerate import Accelerator
from tqdm.auto import tqdm
from pathlib import Path
def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    """Train ``model`` to predict the noise added to each batch, saving samples and checkpoints.

    Args:
        config: settings object; reads ``mixed_precision``,
            ``gradient_accumulation_steps``, ``output_dir``, ``num_epochs``,
            ``save_image_epochs`` and ``save_model_epochs``.
        model: network called as ``model(noisy_images, timesteps, return_dict=False)``.
        noise_scheduler: provides ``add_noise`` and ``config.num_train_timesteps``.
        optimizer: optimizer stepped once per batch.
        train_dataloader: iterable of clean image batches.
        lr_scheduler: learning-rate scheduler stepped once per batch.
    """
    # Accelerator with TensorBoard logging under <output_dir>/logs
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.output_dir, "logs"),
    )
    if accelerator.is_main_process:
        assert config.output_dir is not None
        os.makedirs(config.output_dir, exist_ok=True)
        accelerator.init_trackers("train_example")

    # prepare() hands the objects back in the same order they were passed in.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    final_epoch = config.num_epochs - 1
    global_step = 0

    for epoch in range(config.num_epochs):
        progress_bar = tqdm(
            total=len(train_dataloader),
            disable=not accelerator.is_local_main_process,
        )
        progress_bar.set_description(f"Epoch {epoch}")

        for clean_images in train_dataloader:
            batch_size = clean_images.shape[0]
            target_device = clean_images.device

            # Noise to add, then one random timestep per image
            noise = torch.randn(clean_images.shape, device=target_device)
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (batch_size,),
                device=target_device,
                dtype=torch.int64,
            )

            # Forward diffusion: noise each image according to its timestep
            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

            with accelerator.accumulate(model):
                # Predict the noise residual and regress it onto the true noise
                noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {
                "loss": loss.detach().item(),
                "lr": lr_scheduler.get_last_lr()[0],
                "step": global_step,
            }
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        # After each epoch: optionally sample demo outputs and save a checkpoint
        if accelerator.is_main_process:
            pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler)
            epoch_number = epoch + 1
            is_final_epoch = epoch == final_epoch

            if epoch_number % config.save_image_epochs == 0 or is_final_epoch:
                evaluate(config, epoch, pipeline)

            if epoch_number % config.save_model_epochs == 0 or is_final_epoch:
                checkpoint_dir = os.path.join(config.output_dir, "checkpoints", f"epoch_{epoch_number}")
                pipeline.save_pretrained(checkpoint_dir)
Run training¶
Uncomment and run to train the model
# from accelerate import notebook_launcher
# args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler)
# notebook_launcher(train_loop, args, num_processes=1)
Evaluation¶
Below are the audio samples and spectrograms generated by the diffusion model at various epochs. Files are named {epoch}_{sample num}.npy.
# Plot every generated spectrogram, reconstruct audio from it, and export both.
sample_dir = "./mel-generation/128x256"
# sample_dir = os.path.join(config.output_dir, "samples")

# Sorted so the files are processed in a stable order.
nplist_files = sorted(f for f in os.listdir(sample_dir) if f.endswith(".npy"))

# Upper bound on the number of files to process; -1 processes all of them.
max_files = -1
selected_files = nplist_files if max_files == -1 else nplist_files[:max_files]

# Make sure the export folder exists before anything is written into it.
os.makedirs(EXPORT, exist_ok=True)

for file in selected_files:
    file_path = os.path.join(sample_dir, file)

    # Load the generated spectrogram
    mel = np.load(file_path)
    print(file)

    # Convert power to dB relative to the loudest bin for display
    log_S = librosa.power_to_db(mel, ref=np.max)

    # Plot
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
    plt.title(f"Log-Mel Spectrogram: {file}")
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

    # Spectrogram to audio
    y = librosa.feature.inverse.mel_to_audio(mel, sr=SAMPLE_RATE, n_fft=2048, hop_length=512)
    display(Audio(y, rate=SAMPLE_RATE))
    sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
19_0.npy
19_1.npy
19_2.npy
19_3.npy
39_0.npy
39_1.npy
39_2.npy
39_3.npy
59_0.npy
59_1.npy
59_2.npy
59_3.npy
79_0.npy
79_1.npy
79_2.npy
79_3.npy
99_0.npy
99_1.npy
99_2.npy
99_3.npy
Baseline comparison¶
# Baseline: a spectrogram of uniform random noise, plotted and converted to
# audio with the same settings as the model samples.
baseline = np.random.rand(128, 256)
# NOTE(review): scaling by SAMPLE_RATE / 2 looks like an arbitrary choice of
# power range -- confirm the intended magnitude.
baseline = baseline * SAMPLE_RATE / 2
log_S = librosa.power_to_db(baseline, ref=np.max)
file = "baseline.npy"

# Make sure the export folder exists before anything is written into it.
os.makedirs(EXPORT, exist_ok=True)

# Plot
plt.figure(figsize=(10, 4))
librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
plt.title(f"Log-Mel Spectrogram: {file}")
plt.colorbar(format="%+2.0f dB")
plt.tight_layout()
plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

# Spectrogram to audio. Invert the baseline itself: the previous code passed
# `mel`, i.e. the last model sample left over from the evaluation loop, so the
# exported "baseline" audio was not the baseline at all.
y = librosa.feature.inverse.mel_to_audio(baseline, sr=SAMPLE_RATE, n_fft=2048, hop_length=512)
display(Audio(y, rate=SAMPLE_RATE))
sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
Model 2: Symbolic Unconditioned Generation¶
Install and load required libraries¶
# !pip install miditok
# !pip install mido
#!pip install symusic
#!pip install glob
#!pip install torch
# !unzip data.zip
from google.colab import files
import glob
import random
from typing import List
from collections import defaultdict
import numpy as np
from numpy.random import choice
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from mido import MidiFile
from symusic import Score
from miditok import REMI, TokenizerConfig
Exploratory Data Analysis¶
# Collect the MIDI files of the train / test splits (glob returns them in no guaranteed order).
train_files = glob.glob("./data/train/*.mid")
test_files = glob.glob("./data/test/*.mid")
def get_midi_len(file):
    """Return ``MidiFile(file).length``, or 0 when the file cannot be read.

    Any exception raised while loading is printed and swallowed so that one
    bad file does not abort the surrounding statistics.
    """
    try:
        return MidiFile(file).length
    except Exception as e:
        print(e)
    return 0
# Basic statistics of the training split. mido reports MidiFile.length as
# playback time in seconds, so the totals are labelled in seconds (not ticks).
num_samples = len(train_files)
print(f"Number of training samples: {num_samples}")

total_length = sum(get_midi_len(file) for file in train_files)
print(f"Total length of training samples in seconds: {total_length}")

# Guard against an empty training folder instead of dividing by zero.
avg_length = total_length / num_samples if num_samples else 0.0
print(f"Avg length of training samples in seconds: {avg_length}")
Number of training samples: 638 Total length of training samples in ticks: 72914.93901779644 Avg length of training samples in ticks: 114.28673827240821
Model: Second Order Markov Chain¶
This model serves as a baseline of comparison for our LSTM model.
Preprocessing¶
Train Midi Tokenizer
# Configure a REMI tokenizer, train it on the training files to a vocabulary of
# 1000 tokens, and save it to tokenizer.json.
# NOTE: this rebinds the name `config`.
config = TokenizerConfig(num_velocities=1, use_chords=False, use_programs=True)
tokenizer = REMI(config)
tokenizer.train(vocab_size=1000, files_paths=train_files)
tokenizer.save("tokenizer.json")
/usr/local/lib/python3.11/dist-packages/miditok/tokenizations/remi.py:88: UserWarning: Attribute controls are not compatible with 'config.one_token_stream_for_programs' and multi-vocabulary tokenizers. Disabling them from the config. super().__init__(tokenizer_config, params)
Construct PyTorch Dataset and Dataloaders
class MIDIDataset(Dataset):
    """Dataset that loads and tokenizes one MIDI file per item.

    Args:
        file_paths: paths of the MIDI files.
        tokenizer: callable applied to the loaded score.
    """

    def __init__(self, file_paths: List[str], tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        # Load the file, tokenize it, and hand the result back as a NumPy array.
        path = self.file_paths[idx]
        score = Score(path)
        return np.array(self.tokenizer(score))
# One tokenized file per batch; only the training loader is shuffled.
train_dataset = MIDIDataset(file_paths=train_files, tokenizer=tokenizer)
test_dataset = MIDIDataset(file_paths=test_files, tokenizer=tokenizer)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)
Model¶
class SecondOrderMarkovChain:
    """Second-order Markov chain over integer token ids.

    The next token is drawn from a distribution conditioned on the two
    preceding tokens; the distribution is estimated from transition counts.
    """

    def __init__(self):
        # (token_a, token_b) -> {next_token: count}
        self.transitions = defaultdict(lambda: defaultdict(int))
        # (token_a, token_b) -> {next_token: probability}
        self.probabilities = defaultdict(lambda: defaultdict(float))

    def train(self, train_loader):
        """Count token transitions and normalize them into probabilities.

        Args:
            train_loader: iterable of batches. Only the first sequence of each
                batch (``batch[0]``) is used, and it must provide ``.numpy()``.

        Returns:
            The table mapping ``(token_a, token_b)`` to ``{next_token: probability}``.
        """
        for batch in train_loader:
            sequence = batch[0].numpy().astype(int)
            # Slide a window of three consecutive tokens over the sequence.
            for first, second, following in zip(sequence, sequence[1:], sequence[2:]):
                self.transitions[(first, second)][following] += 1

        for context, next_states in self.transitions.items():
            total = sum(next_states.values())
            for next_state, count in next_states.items():
                self.probabilities[context][next_state] = count / total
        return self.probabilities

    def generate(self, test_sequence, num_predictions=1, max_length=100):
        """Generate a continuation seeded with the first two tokens of a sequence.

        Args:
            test_sequence: batch whose first element provides ``.numpy()``; its
                first two tokens seed the chain.
            num_predictions: number of candidates drawn per step; only the
                first one is appended to the result.
            max_length: maximum number of tokens generated after the seed.

        Returns:
            List of token ids starting with the seed tokens. Generation stops
            early when the current two-token context was never seen in training.
            A seed with fewer than two tokens is returned unchanged.
        """
        seed = test_sequence[0].numpy().astype(int)
        results = list(seed[:2])
        if len(results) < 2:
            # Not enough context for a second-order lookup.
            return results

        for _ in range(max_length):
            context = (results[-2], results[-1])
            # Test membership first: indexing the defaultdict directly would
            # insert an empty entry for every unseen context.
            if context not in self.probabilities:
                break
            probs = self.probabilities[context]
            if not probs:
                break
            states = list(probs.keys())
            weights = list(probs.values())
            try:
                predictions = np.random.choice(states, size=num_predictions, p=weights)
            except ValueError:
                # NumPy rejected the arguments (e.g. the weights do not sum
                # to 1); stop generating instead of hiding every error type.
                break
            results.append(predictions[0])
        return results
Training¶
# Fit the Markov baseline, generate one continuation per test file, and export
# each continuation as a MIDI file.
model = SecondOrderMarkovChain()
model.train(train_loader)

predictions = []
for test_sequence in test_loader:
    predictions.append(model.generate(test_sequence))

# Make sure the output folder exists before the MIDI files are written.
os.makedirs("markov", exist_ok=True)
for i, prediction in enumerate(predictions):
    output_score = tokenizer.decode(torch.Tensor(prediction))
    output_score.dump_midi(f"markov/{i}.mid")
Download Output
# !zip -r markov.zip ./markov
# files.download("markov.zip")
Model: LSTM¶
This is the primary model I will be exploring
Preprocessing¶
from miditok.pytorch_data import DatasetMIDI, DataCollator
# REMI tokenizer with default parameters (constants.py); this replaces the
# tokenizer used for the Markov baseline.
tokenizer = REMI()

# Both splits share the same tokenization / chunking settings.
dataset_settings = dict(
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
train_dataset = DatasetMIDI(files_paths=train_files, **dataset_settings)
test_dataset = DatasetMIDI(files_paths=test_files, **dataset_settings)

# The collator is built with the tokenizer's pad token id.
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, collate_fn=collator, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, collate_fn=collator, batch_size=4, shuffle=False)
len(train_loader), len(test_loader)
(160, 18)
Model¶
class MusicRNN(nn.Module):
    """Next-token model: embedding -> multi-layer LSTM -> linear projection.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output logits).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout value passed to the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for token ids ``x`` of shape (batch, seq)."""
        embedded = self.embedding(x)                  # (batch, seq, embedding_dim)
        features, hidden = self.rnn(embedded, hidden)  # (batch, seq, hidden_dim)
        logits = self.fc(features)                    # (batch, seq, vocab_size)
        return logits, hidden
Training¶
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device='cuda',
          pad_token_id=None):
    """Train a next-token model with teacher forcing and report the loss per epoch.

    Args:
        model: module whose call returns ``(logits, hidden)``.
        train_loader: iterable of batches shaped like ``{'input_ids': LongTensor(batch, seq)}``.
        val_loader: iterable with the same batch format, used for validation only.
        vocab_size: size of the last logits dimension.
        num_epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: target device; a CUDA device falls back to the CPU when CUDA
            is not available.
        pad_token_id: when given, target positions holding this id are
            excluded from the loss. ``None`` (default) scores every position.
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        # Avoid crashing on machines without a GPU.
        print("CUDA is not available; training on CPU instead.")
        device = 'cpu'

    model = model.to(device)
    if pad_token_id is None:
        criterion = nn.CrossEntropyLoss()
    else:
        # Padding carries no information, so it should not be scored.
        criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def _batch_loss(batch):
        """Shift the tokens by one position and score the model's predictions."""
        tokens = batch['input_ids'].to(device)  # (batch_size, seq_length)
        inputs, targets = tokens[:, :-1], tokens[:, 1:]
        outputs, _ = model(inputs)
        return criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _batch_loss(batch)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                total_val_loss += _batch_loss(batch).item()
        avg_val_loss = total_val_loss / max(len(val_loader), 1)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
# Example usage
if __name__ == "__main__":
    # Model hyperparameters
    vocab_size = tokenizer.vocab_size
    embedding_dim, hidden_dim = 256, 512
    num_layers, dropout = 2, .3

    # Build the LSTM and train it for the first 15 epochs.
    model = MusicRNN(
        vocab_size,
        embedding_dim,
        hidden_dim,
        num_layers,
        dropout,
    )
    train(model, train_loader, test_loader, vocab_size, num_epochs=15)
Epoch 1/15 | Train Loss: 2.7964 | Val Loss: 1.9718 Epoch 2/15 | Train Loss: 1.7968 | Val Loss: 1.6201 Epoch 3/15 | Train Loss: 1.5021 | Val Loss: 1.4396 Epoch 4/15 | Train Loss: 1.3268 | Val Loss: 1.3043 Epoch 5/15 | Train Loss: 1.2244 | Val Loss: 1.2479 Epoch 6/15 | Train Loss: 1.1488 | Val Loss: 1.2160 Epoch 7/15 | Train Loss: 1.0998 | Val Loss: 1.1965 Epoch 8/15 | Train Loss: 1.0568 | Val Loss: 1.1939 Epoch 9/15 | Train Loss: 1.0244 | Val Loss: 1.1704 Epoch 10/15 | Train Loss: 0.9711 | Val Loss: 1.1626 Epoch 11/15 | Train Loss: 0.9323 | Val Loss: 1.1570 Epoch 12/15 | Train Loss: 0.8970 | Val Loss: 1.1454 Epoch 13/15 | Train Loss: 0.8553 | Val Loss: 1.1402 Epoch 14/15 | Train Loss: 0.8196 | Val Loss: 1.1539 Epoch 15/15 | Train Loss: 0.7933 | Val Loss: 1.1445
Train 15 Epoch Model¶
def sample(model, start_token, max_length=100, temperature=1.0, device='cuda', stop_tokens=(0, 2)):
    """Autoregressively sample a token sequence from ``model``.

    Args:
        model: module called as ``model(input_token, hidden)`` that returns
            ``(logits, hidden)`` with logits of shape (1, seq, vocab_size).
        start_token: id of the first token of the sequence.
        max_length: maximum number of tokens sampled after ``start_token``.
        temperature: divisor applied to the logits; values above 1 flatten the
            distribution, values below 1 sharpen it.
        device: target device; a CUDA device falls back to the CPU when CUDA
            is not available.
        stop_tokens: ids that end generation once sampled. Defaults to ids 0
            and 2 (presumably PAD and EOS -- confirm against the tokenizer).

    Returns:
        List of token ids starting with ``start_token``; a sampled stop token
        is included as the last element.
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        # Avoid crashing on machines without a GPU.
        device = 'cpu'

    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)
    hidden = None

    # Inference only: without no_grad() every step would extend an autograd
    # graph through the recurrent state and waste memory.
    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
            logits = output[:, -1, :] / temperature      # last position, adjusted randomness
            probs = F.softmax(logits, dim=-1)            # (1, vocab_size)
            next_token = torch.multinomial(probs, num_samples=1).item()
            generated.append(next_token)
            if next_token in stop_tokens:  # reached the end of the sequence
                break
            input_token = torch.tensor([[next_token]], device=device)

    return generated
# Generate one sequence, starting from the tokenizer's special token at index 1
# (presumably BOS -- confirm against the tokenizer), and print the raw ids.
start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)
print("Generated token sequence:")
print(generated_sequence)
Generated token sequence: [1, 4, 189, 44, 124, 132, 197, 49, 124, 128, 201, 47, 124, 128, 205, 51, 124, 128, 209, 49, 124, 132, 217, 44, 124, 126, 219, 44, 124, 126, 4, 189, 42, 124, 128, 193, 47, 124, 128, 197, 46, 124, 136, 209, 51, 124, 128, 213, 47, 124, 128, 217, 47, 124, 128, 4, 189, 42, 124, 132, 197, 40, 124, 132, 205, 40, 124, 132, 213, 37, 124, 132, 4, 189, 38, 124, 126, 191, 34, 124, 126, 193, 35, 124, 126, 195, 30, 124, 126, 197, 18, 124, 140, 213, 30, 124, 126, 217, 43, 124, 126, 219, 42, 124, 126, 4, 189, 40, 124, 140, 47, 124, 140, 205, 41, 124, 140, 42, 124, 140, 4, 189, 66, 124, 140, 38, 124, 138, 205, 40, 124, 140, 44, 124, 140, 4, 189, 53, 124, 156, 4, 189, 44, 124, 156, 4, 189, 44, 124, 126, 191, 42, 124, 126, 193, 41, 124, 126, 195, 37, 124, 126, 197, 35, 124, 126, 199, 37, 124, 126, 201, 38, 124, 126, 203, 38, 124, 126, 205, 30, 124, 126, 207, 42, 124, 126, 209, 46, 124, 126, 211, 46, 124, 126, 213, 25, 124, 126, 215, 46, 124, 126, 217, 42, 124, 126, 219, 42, 124, 126, 4, 189, 40, 124, 126, 191, 43, 124, 126, 193, 40, 124, 126, 195, 42, 124, 126, 201, 44, 124, 126, 203, 47, 124, 126, 205, 44, 124, 126, 207, 48, 124, 126, 209, 54, 124, 126, 211, 52, 124, 126, 213, 50, 124, 126, 215, 46, 124, 126, 217, 47, 124, 126, 219, 49, 124, 126, 4, 189, 47, 124, 126, 191, 45, 124, 126, 193, 49, 124, 126, 195, 47, 124, 126, 197, 116, 124, 126, 199, 47, 124, 126, 201, 44, 124, 126, 203, 42, 124, 126, 205, 47, 124, 126, 207, 52, 124, 126, 209, 54, 124, 126, 211, 44, 124, 126, 213, 47, 124, 132, 4, 189, 46, 124, 132, 197, 49, 124, 126, 199, 47, 124, 126, 201, 49, 124, 126, 203, 52, 124, 126, 207, 45, 124, 126, 209, 47, 124, 126, 211, 47, 124, 126, 213, 51, 124, 126, 215, 50, 124, 126, 217, 46, 124, 126, 219, 45, 124, 126, 4, 189, 45, 124, 128, 193, 43, 124, 126, 219, 38, 124, 126, 205, 42, 124, 126, 207, 42, 124, 126, 209, 47, 124, 128, 213, 41, 124, 132, 4, 189, 35, 124, 126, 191, 40, 124, 126, 193, 47, 124, 126, 195, 39, 124, 126, 197, 40, 124, 140, 213, 50, 
124, 132, 4, 193, 43, 124, 126, 195, 42, 124, 126, 197, 45, 124, 126, 199, 43, 124, 126, 201, 41, 124, 126, 203, 40, 124, 126, 205, 45, 124, 126, 207, 49, 124, 126, 209, 39, 124, 126, 211, 45, 124, 126, 213, 43, 124, 126, 215, 45, 124, 126, 217, 44, 124, 126, 219, 47, 124, 126, 4, 189, 41, 124, 132, 197, 45, 124, 132, 205, 44, 124, 132, 213, 40, 124, 132, 4, 189, 43, 124, 132, 197, 42, 124, 132, 205, 44, 124, 132, 4, 189, 44, 124, 140, 205, 46, 124, 140, 4, 189, 47, 124, 140, 205, 47, 124, 140, 4, 189, 49, 124, 140, 205, 37, 124, 140, 4, 189, 35, 124, 140, 205, 37, 124, 140, 4, 189, 37, 124, 140, 4, 4, 189, 63, 124, 132, 56, 124, 140, 197, 59, 124, 132, 205, 59, 124, 132, 58, 124, 132, 213, 59, 124, 132, 61, 124, 132, 4, 189, 63, 124, 126, 67, 124, 126, 191, 66, 124, 126, 59, 124, 126, 193, 61, 124, 126, 66, 124, 126, 195, 64, 124, 126, 64, 124, 126, 199, 66, 124, 126, 66, 124, 126, 201, 63, 124, 132, 52, 124, 132, 46, 124, 132, 205, 66, 124, 132, 209, 66, 124, 126, 74, 124, 126, 211, 66, 124, 126, 66, 124, 126, 213, 75, 124, 126, 215, 66, 124, 126, 78, 124, 126, 217, 66, 124, 126, 66, 124, 126, 219, 65, 124, 126, 71, 124, 126, 4, 189, 66, 124, 126, 71, 124, 126, 193, 71, 124, 126, 63, 124, 126, 195, 68, 124, 126, 69, 124, 126, 197, 66, 124, 126, 64, 124, 126, 199, 69, 124, 126, 59, 124, 126, 201, 68, 123, 126, 69, 124, 126, 203, 64, 124, 126, 71, 124, 126, 205, 66, 124, 126, 66, 124, 126, 207, 65, 124, 126, 68, 124, 126, 209, 69, 124, 126, 64, 124, 126, 211, 68, 124, 126, 65, 124, 126, 215, 63, 124, 126, 66, 124, 126, 217, 61, 124, 126, 4, 191, 67, 124, 126, 72, 124, 126, 193, 73, 124, 126, 124, 124, 126, 195, 64, 124, 126, 57, 124, 126, 197, 71, 124, 126, 55, 124, 126, 199, 59, 124, 126, 56, 124, 126, 201, 64, 124, 126, 65, 124, 126, 203, 60, 124, 126, 69, 124, 126, 205, 66, 124, 126, 69, 124, 126, 207, 62, 124, 126, 59, 124, 126, 209, 65, 124, 126, 64, 124, 126, 211, 59, 124, 126, 213, 64, 124, 126, 69, 124, 126, 215, 67, 124, 126, 217, 65, 124, 126, 57, 124, 
126, 219, 58, 124, 126, 62, 124, 126, 4, 189, 48, 124, 126, 52, 124, 126, 191, 55, 124, 126, 55, 124, 126, 193, 52, 124, 126, 57, 124, 126, 195, 52, 124, 126, 53, 124, 126, 197, 57, 124, 126, 57, 124, 126, 199, 52, 124, 126, 57, 124, 126, 201, 53, 124, 126, 57, 124, 126, 203, 57, 124, 126, 59, 124, 126, 205, 55, 124, 126, 57, 124, 126, 207, 55, 124, 126, 57, 124, 126, 209, 57, 124, 126, 55, 124, 126, 211, 56, 124, 128, 57, 124, 126, 215, 52, 124, 126, 57, 124, 126, 4, 189, 53, 124, 136, 57, 124, 140, 56, 124, 130, 201, 56, 124, 132, 61, 124, 132, 209, 51, 124, 128, 54, 124, 128, 213, 53, 124, 132, 56, 124, 132, 4, 189, 59, 124, 128, 56, 124, 128, 193, 52, 124, 128, 54, 124, 128, 57, 124, 128, 197, 59, 124, 128, 51, 124, 128, 201, 51, 124, 128, 61, 124, 128, 205, 58, 124, 128, 53, 124, 128, 61, 124, 128, 209, 56, 124]
# Sample 10 sequences from the 15-epoch model, each starting from a randomly
# chosen vocabulary token, and export them as MIDI files.
os.makedirs("rnn_15", exist_ok=True)  # make sure the output folder exists
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
    generated_sequence = sample(model, start_token, max_length=1024)
    # decode() is the current name of the deprecated tokens_to_midi().
    output_score = tokenizer.decode([generated_sequence])
    output_score.dump_midi(f"rnn_15/rnn_{i}.mid")
<ipython-input-19-2ec83c333e95>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates. output_score = tokenizer.tokens_to_midi([generated_sequence])
# !zip -r rnn_15.zip ./rnn_15
# files.download("rnn_15.zip")
Train 25 Epoch Model¶
# Continue training the same model for 10 more epochs (15 + 10 = 25 in total).
train(model, train_loader, test_loader, vocab_size, num_epochs=10)
Epoch 1/10 | Train Loss: 0.7644 | Val Loss: 1.1469 Epoch 2/10 | Train Loss: 0.7250 | Val Loss: 1.1617 Epoch 3/10 | Train Loss: 0.6925 | Val Loss: 1.1655 Epoch 4/10 | Train Loss: 0.6596 | Val Loss: 1.1808 Epoch 5/10 | Train Loss: 0.6267 | Val Loss: 1.1996 Epoch 6/10 | Train Loss: 0.6004 | Val Loss: 1.2031 Epoch 7/10 | Train Loss: 0.5703 | Val Loss: 1.2232 Epoch 8/10 | Train Loss: 0.5399 | Val Loss: 1.2488 Epoch 9/10 | Train Loss: 0.5151 | Val Loss: 1.2462 Epoch 10/10 | Train Loss: 0.4844 | Val Loss: 1.2806
# Sample 10 sequences from the 25-epoch model, each starting from a randomly
# chosen vocabulary token, and export them as MIDI files.
os.makedirs("rnn_25", exist_ok=True)  # make sure the output folder exists
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
    generated_sequence = sample(model, start_token, max_length=1024)
    # decode() is the current name of the deprecated tokens_to_midi().
    output_score = tokenizer.decode([generated_sequence])
    output_score.dump_midi(f"rnn_25/rnn_{i}.mid")
<ipython-input-23-b088643153c2>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates. output_score = tokenizer.tokens_to_midi([generated_sequence])
# !zip -r rnn_25.zip ./rnn_25
# files.download("rnn_25.zip")
Train 50 Epoch Model¶
# Continue training the same model for 25 more epochs (25 + 25 = 50 in total).
train(model, train_loader, test_loader, vocab_size, num_epochs=25)
Epoch 1/25 | Train Loss: 0.4660 | Val Loss: 1.3109 Epoch 2/25 | Train Loss: 0.4301 | Val Loss: 1.3256 Epoch 3/25 | Train Loss: 0.4093 | Val Loss: 1.3385 Epoch 4/25 | Train Loss: 0.3873 | Val Loss: 1.3756 Epoch 5/25 | Train Loss: 0.3738 | Val Loss: 1.4017 Epoch 6/25 | Train Loss: 0.3531 | Val Loss: 1.4187 Epoch 7/25 | Train Loss: 0.3376 | Val Loss: 1.4391 Epoch 8/25 | Train Loss: 0.3207 | Val Loss: 1.4662 Epoch 9/25 | Train Loss: 0.3036 | Val Loss: 1.4819 Epoch 10/25 | Train Loss: 0.2872 | Val Loss: 1.5067 Epoch 11/25 | Train Loss: 0.2718 | Val Loss: 1.5175 Epoch 12/25 | Train Loss: 0.2532 | Val Loss: 1.5516 Epoch 13/25 | Train Loss: 0.2442 | Val Loss: 1.5718 Epoch 14/25 | Train Loss: 0.2355 | Val Loss: 1.5953 Epoch 15/25 | Train Loss: 0.2303 | Val Loss: 1.6487 Epoch 16/25 | Train Loss: 0.2297 | Val Loss: 1.6254 Epoch 17/25 | Train Loss: 0.2108 | Val Loss: 1.6725 Epoch 18/25 | Train Loss: 0.1975 | Val Loss: 1.7077 Epoch 19/25 | Train Loss: 0.1855 | Val Loss: 1.7225 Epoch 20/25 | Train Loss: 0.1789 | Val Loss: 1.7439 Epoch 21/25 | Train Loss: 0.1782 | Val Loss: 1.7471 Epoch 22/25 | Train Loss: 0.1725 | Val Loss: 1.7761 Epoch 23/25 | Train Loss: 0.1651 | Val Loss: 1.7997 Epoch 24/25 | Train Loss: 0.1592 | Val Loss: 1.8070 Epoch 25/25 | Train Loss: 0.1531 | Val Loss: 1.8271
# Sample 10 sequences from the 50-epoch model, each starting from a randomly
# chosen vocabulary token, and export them as MIDI files.
os.makedirs("rnn_50", exist_ok=True)  # make sure the output folder exists
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
    generated_sequence = sample(model, start_token, max_length=1024)
    # decode() is the current name of the deprecated tokens_to_midi().
    output_score = tokenizer.decode([generated_sequence])
    output_score.dump_midi(f"rnn_50/rnn_{i}.mid")
<ipython-input-26-ce489eda5a3f>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates. output_score = tokenizer.tokens_to_midi([generated_sequence])
# Zip the generated MIDI files (IPython shell escape) and download the archive
# through the google.colab `files` helper.
!zip -r rnn_50.zip ./rnn_50
files.download("rnn_50.zip")
adding: rnn_50/ (stored 0%) adding: rnn_50/rnn_5.mid (deflated 60%) adding: rnn_50/rnn_0.mid (deflated 67%) adding: rnn_50/rnn_1.mid (deflated 63%) adding: rnn_50/rnn_3.mid (deflated 59%) adding: rnn_50/rnn_2.mid (deflated 79%) adding: rnn_50/rnn_6.mid (deflated 66%) adding: rnn_50/rnn_4.mid (deflated 64%) adding: rnn_50/rnn_7.mid (deflated 62%) adding: rnn_50/rnn_8.mid (deflated 68%) adding: rnn_50/rnn_9.mid (deflated 70%)